library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the kindergarten immunization records from the CSV in the working
# directory; column types are guessed by readr.
immune <- readr::read_csv(file = "kindergarten_CA.csv")
## Parsed with column specification:
## cols(
## district = col_character(),
## sch_code = col_double(),
## county = col_character(),
## pub_priv = col_character(),
## school = col_character(),
## enrollment = col_double(),
## complete = col_double(),
## start_year = col_double()
## )
# Inspect the structure (column names, types, first values) of the raw data.
str(object = immune)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 110382 obs. of 8 variables:
## $ district : chr "Alameda Unified" "Alameda Unified" "Alameda Unified" "Alameda Unified" ...
## $ sch_code : num 6967434 6110779 6100374 6090013 6090039 ...
## $ county : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ pub_priv : chr "Private" "Public" "Public" "Public" ...
## $ school : chr "ALAMEDA CHRTN" "BAY FARM ELEM" "EARHART (AMELIA) ELEM" "EDISON ELEM" ...
## $ enrollment: num 12 78 77 56 41 75 40 80 61 49 ...
## $ complete : num 11 77 73 53 41 65 34 76 61 43 ...
## $ start_year: num 2001 2001 2001 2001 2001 ...
## - attr(*, "spec")=
## .. cols(
## .. district = col_character(),
## .. sch_code = col_double(),
## .. county = col_character(),
## .. pub_priv = col_character(),
## .. school = col_character(),
## .. enrollment = col_double(),
## .. complete = col_double(),
## .. start_year = col_double()
## .. )
The data contain 8 variables and (originally) 110,382 observations. The variables are: * district * sch_code * county * pub_priv * school * enrollment * complete * start_year
# Drop every row with a missing value, then express the number of completely
# immunized children as a percentage of the school's enrollment.
immune <- immune %>%
  na.omit() %>%
  mutate(ratio_complete = (complete / enrollment) * 100)

# Five-number summary (plus mean) of the per-school completion percentage.
completesum <- summary(immune$ratio_complete)
completesum
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 88.06 95.00 90.33 98.46 100.00
# Distribution of the per-school completion percentage over all years.
# `main = ""` suppresses the default title.
hist1 <- hist(
  x = immune$ratio_complete,
  breaks = 10,
  col = 3,
  main = "",
  xlab = "Percentages of completely immunized students"
)
First, let us express the number of completely immunized students as a percentage of each school's enrollment. Next, let us find the top 5 counties by enrollment and filter the data down to them. The study years can be selected with `thousand_one <- filter(immune, start_year %in% c("2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015"))`.
# Rank counties by total kindergarten enrollment across all years and keep
# only the five largest. Without the final head(5) the result listed every
# county, which contradicts the name `top5`.
top5 <- immune %>%
  group_by(county) %>%
  summarize(enrolled = sum(enrollment, na.rm = TRUE)) %>%
  arrange(desc(enrolled)) %>%
  select(county) %>%
  head(5)
# Subset of the data for the five counties of interest, keeping only the
# columns used for plotting. The completion percentage is recomputed here so
# this step does not rely on it having been added earlier.
immune1 <- immune %>%
  filter(
    county %in% c(
      "Los Angeles", "Orange", "San Diego", "San Bernardino", "Riverside"
    )
  ) %>%
  mutate(ratio_complete = (complete / enrollment) * 100) %>%
  select(county, start_year, ratio_complete, enrollment, complete)
library(ggplot2)
library(RColorBrewer)
# Heat map of the completion percentage for every county and year.
#
# The data are first collapsed to one row per (county, start_year) cell:
# plotting the raw school-level rows stacks thousands of tiles on top of each
# other, so each cell showed an arbitrary school and ggplotly() warned that
# the data length did not fit the county x year grid. The cell value is the
# enrollment-weighted percentage, i.e. all completely immunized children in
# the county divided by all enrolled children.
#
# The tile border is a fixed colour, so it is set outside aes(); inside aes()
# the string "grey50" was treated as a data value and produced a legend entry.
p1 <- immune %>%
  group_by(county, start_year) %>%
  summarize(ratio_complete = sum(complete) / sum(enrollment) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = start_year, y = county, fill = ratio_complete)) +
  geom_tile(color = "grey50") +
  xlab("Year") +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  ggtitle("Percentages of completely immunized students per county (California)")
ggplotly(p1)
## Warning in matrix(g$fill_plotlyDomain, nrow = length(y), ncol =
## length(x), : data length [108732] is not a sub-multiple or multiple of the
## number of rows [58]
## Warning in matrix(g$hovertext, nrow = length(y), ncol = length(x), byrow =
## TRUE): data length [108732] is not a sub-multiple or multiple of the number
## of rows [58]
# Heat map of the completion percentage for the five selected counties.
#
# As with any tile plot, the data must have one row per (county, start_year)
# cell; the school-level rows are therefore aggregated to the
# enrollment-weighted percentage before plotting. The title now says "top 5"
# to match the five counties actually present in `immune1`.
p2 <- immune1 %>%
  group_by(county, start_year) %>%
  summarize(ratio_complete = sum(complete) / sum(enrollment) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = start_year, y = county)) +
  geom_tile(aes(fill = ratio_complete)) +
  xlab("Year") +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  ggtitle("Percentages of completely immunized students by top 5 counties (California)")
ggplotly(p2)
## Warning in matrix(g$fill_plotlyDomain, nrow = length(y), ncol =
## length(x), : data length [55259] is not a sub-multiple or multiple of the
## number of rows [5]
## Warning in matrix(g$hovertext, nrow = length(y), ncol = length(x), byrow =
## TRUE): data length [55259] is not a sub-multiple or multiple of the number
## of rows [5]
# Heat map comparing public and private schools by year.
#
# The school-level rows are aggregated to one enrollment-weighted percentage
# per (pub_priv, start_year) cell; drawing one tile per school overplots each
# cell and makes ggplotly() warn that the data do not fit the grid.
p3 <- immune %>%
  group_by(pub_priv, start_year) %>%
  summarize(ratio_complete = sum(complete) / sum(enrollment) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = start_year, y = pub_priv)) +
  geom_tile(aes(fill = ratio_complete)) +
  xlab("Year") +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  ggtitle("Percentages of completely immunized students: Public vs Private (California)")
ggplotly(p3)
## Warning in matrix(g$fill_plotlyDomain, nrow = length(y), ncol =
## length(x), : data length [108730] is not a sub-multiple or multiple of the
## number of columns [15]
## Warning in matrix(g$hovertext, nrow = length(y), ncol = length(x), byrow =
## TRUE): data length [108730] is not a sub-multiple or multiple of the number
## of columns [15]
# Unweighted mean of the per-school completion percentage, returned as a
# named vector.
colMeans(immune["ratio_complete"])
## ratio_complete
## 90.33241
# Distribution of the per-school completion percentage for the 2001 cohort.
thousand_one <- immune %>%
  filter(start_year %in% 2001)
hist2 <- hist(
  x = thousand_one$ratio_complete,
  breaks = 10,
  col = 3,
  xlab = "Percentages of completely immunized students"
)

# Same histogram for the 2011 cohort. Note that `hist2` is reused, so after
# this point it holds the 2011 histogram object.
thousand_eleven <- immune %>%
  filter(start_year %in% 2011)
hist2 <- hist(
  x = thousand_eleven$ratio_complete,
  breaks = 10,
  col = 3,
  xlab = "Percentages of completely immunized students"
)
This data comes from the state of California.